import warnings
warnings.filterwarnings("ignore")
import os
import time
import re
import pandas as pd
import numpy as np
import yellowbrick
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.font_manager import FontProperties
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from sklearn import metrics, preprocessing
from sklearn.svm import SVC
from sklearn.metrics import average_precision_score, precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report, roc_curve, auc, roc_auc_score, silhouette_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from scipy.cluster.hierarchy import linkage, dendrogram, cut_tree
from sklearn import decomposition
import scipy.stats as stats
from scipy.linalg import eigh
from math import factorial as f
from pylab import rcParams
# Default figure size (width, height in inches) for all plots in this notebook
rcParams['figure.figsize'] = 10, 15
# Render matplotlib figures inline in the notebook
%matplotlib inline
# Load the car-name lookup table (one row per vehicle, single 'car_name' column).
# NOTE(review): absolute local path -- runs only on this machine; consider a relative path.
car_name = pd.read_csv("C:/Users/pri96/OneDrive/Documents/AI and ML PGP/Module 5 - Unsupervised Learning (Week 17 to Week 19)/Project/Car name.csv")
car_name.head()
| car_name | |
|---|---|
| 0 | chevrolet chevelle malibu |
| 1 | buick skylark 320 |
| 2 | plymouth satellite |
| 3 | amc rebel sst |
| 4 | ford torino |
*SOLUTION (1 B.)*¶
# Load the numeric car attributes (mpg, cyl, disp, hp, wt, acc, yr, origin).
# NOTE(review): absolute local path -- runs only on this machine; consider a relative path.
car_attributes = pd.read_json("C:/Users/pri96/OneDrive/Documents/AI and ML PGP/Module 5 - Unsupervised Learning (Week 17 to Week 19)/Project/Car-Attributes.json")
car_attributes.head()
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
*SOLUTION (1 C.)*¶
Let's merge both these dataframes, car_name and car_attributes, based on their index values, i.e., we'll use index values of both dataframes as the key for merging them
# Merge on the row index: both frames describe the same vehicles in the same order.
car = pd.merge(car_name, car_attributes, left_index = True, right_index = True)
car.head()
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
# Report the merged frame's dimensions.
rows, cols = car.shape
print("There are", rows, "rows and", cols, "columns in the dataframe")
There are 398 rows and 9 columns in the dataframe
*SOLUTION (1 D.)*¶
# 5-point summary for every column (include='all' keeps the object-typed columns);
# transposed so each feature is a row.
car.describe(include = 'all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| car_name | 398 | 305 | ford pinto | 6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| mpg | 398.0 | NaN | NaN | NaN | 23.514573 | 7.815984 | 9.0 | 17.5 | 23.0 | 29.0 | 46.6 |
| cyl | 398.0 | NaN | NaN | NaN | 5.454774 | 1.701004 | 3.0 | 4.0 | 4.0 | 8.0 | 8.0 |
| disp | 398.0 | NaN | NaN | NaN | 193.425879 | 104.269838 | 68.0 | 104.25 | 148.5 | 262.0 | 455.0 |
| hp | 398.0 | 94.0 | 150.0 | 22.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| wt | 398.0 | NaN | NaN | NaN | 2970.424623 | 846.841774 | 1613.0 | 2223.75 | 2803.5 | 3608.0 | 5140.0 |
| acc | 398.0 | NaN | NaN | NaN | 15.56809 | 2.757689 | 8.0 | 13.825 | 15.5 | 17.175 | 24.8 |
| yr | 398.0 | NaN | NaN | NaN | 76.01005 | 3.697627 | 70.0 | 73.0 | 76.0 | 79.0 | 82.0 |
| origin | 398.0 | NaN | NaN | NaN | 1.572864 | 0.802055 | 1.0 | 1.0 | 1.0 | 2.0 | 3.0 |
# Column dtypes and non-null counts -- note 'hp' arrives as object, not numeric.
car.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 car_name 398 non-null object 1 mpg 398 non-null float64 2 cyl 398 non-null int64 3 disp 398 non-null float64 4 hp 398 non-null object 5 wt 398 non-null int64 6 acc 398 non-null float64 7 yr 398 non-null int64 8 origin 398 non-null int64 dtypes: float64(3), int64(4), object(2) memory usage: 28.1+ KB
Above table shows the 5 Point Summary of car dataframe which covers car attributes from the 1970s to early 1980s. It details variations in mileage, launch year, horsepower, weight, and acceleration. The distribution of cylinders and origins shows a diverse range of car types and manufacturing locations, providing valuable insights for analyzing trends and characteristics of the automotive industry in that time period.
Based on the provided 5 point summary, we can infer below pointers:
- Number of Instances and Attributes:
- The dataframe contains 398 records of automobile data
- There are 8 attributes out of which 3 are multivalued discrete - cylinders (cyl), model year (yr), origin (origin) and 5 continuous - displacement (disp), horsepower (hp), weight (wt), miles per gallon (mpg), acceleration (acc)
- Numerical Attributes Analysis:
- The no. of *cylinders (cyl)* in cars range from 3 to 8, with 4-cylinder engines being the most prevalent among the cars
- The *car-maker (origin)* takes values only 1,2 and 3, with 1 being the most frequent origin code from where cars are manufactured
- Both *disp and wt* seem to have a right skewed distribution with (Q3 - median) being greater than (median - Q1), and their means surpassing their medians
- *acc and yr* appear to have a symmetric distribution where mean and median are equal and there is minimal variation
- Mean for *mpg* is equal to median so the distribution might be normal. We can validate this in later parts by having EDA
- Observation on *hp* shows that although it is expected to be a numerical value attribute, there are potentially some missing/unexpected values which contribute it to be an 'object' type feature, hindering further statistical analysis. There might be a need of further checks on this column
- Overall Observations:
- The dataset covers a diverse range of automotive attributes crucial for understanding performance and efficiency during the 1970s and early 1980s for various car models
- Inclusion of model year and origin might help in examining trends over time
- Weight and acceleration can also help in providing insights on the handling and speed characteristics of different automobiles
# Report the percentage of missing (NaN) values for EVERY feature.
# NOTE: the previous version looped only over float64/int64 columns, silently
# skipping the object-typed ones ('car_name', 'hp') -- and 'hp' is exactly the
# column that later turns out to hold hidden '?' placeholders, so it must not
# be excluded from this report.
print("Feature-wise percentage of missing values present in the dataframe:")
for column in car.columns:
    missing_values = car[column].isnull().sum()
    total_values = len(car[column])
    percentage = (missing_values / total_values) * 100
    print(f"{column} - {percentage}%")
Feature-wise percentage of missing values present in the dataframe: mpg - 0.0% cyl - 0.0% disp - 0.0% wt - 0.0% acc - 0.0% yr - 0.0% origin - 0.0%
# Percentage of nulls per column: the mean of the boolean null-mask times 100
# is identical to sum-of-nulls divided by row count.
null_percentage = car.isnull().mean() * 100
print("Feature-wise percentage of Null values:\n")
print(null_percentage)
Feature-wise percentage of Null values: car_name 0.0 mpg 0.0 cyl 0.0 disp 0.0 hp 0.0 wt 0.0 acc 0.0 yr 0.0 origin 0.0 dtype: float64
There are no null values in the dataframe. All columns have 398 records which is the total number of rows in the dataframe. We do not require any imputations.
*SOLUTION (2 B.)*¶
# Duplicate check over every attribute except the (near-unique) car name.
car[car.duplicated(subset = car.columns.drop('car_name'))]
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin |
|---|
We do not see any duplicate rows in the dataframe so we do not need any imputation. However, we do see that car_name doesn't have any impact on the efficiency of the vehicle. So, we can drop the 'car_name' column from our dataframe
# The car name carries no signal for clustering -- remove it.
car = car.drop(columns = ['car_name'])
car.head()
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
*SOLUTION (2 C.)*¶
# Pairwise scatter/density plots of all features, coloured by country of origin
sns.pairplot(car, hue = 'origin', palette = 'bright')
plt.show()
Based on the above pair plot, we can see below points:
There is a strong negative correlation between miles per gallon (mpg) and weight (wt). This seems to be valid as heavier cars tend to have lower fuel efficiency
Similarly, there's a negative correlation between mpg and engine displacement (disp), which might state that cars with larger engines tend to be less fuel efficient
There is a positive correlation between weight and engine displacement which means that heavier cars typically have larger engines
Acceleration seems less strongly correlated with other features like weight and displacement compared to mpg. This might suggest that factors other than weight and engine size influence acceleration
Cars with different numbers of cylinders show distinct clusters in the scatter plots. Cars with more cylinders generally have lower mpg
*SOLUTION (2 D.)*¶
# Weight vs engine displacement, coloured by cylinder count
plt.figure(figsize = (10, 6))
sns.scatterplot(data = car, x = 'wt', y = 'disp', hue = 'cyl', palette = 'bright', s = 100, alpha = 0.7)
plt.grid(True)
plt.title('Scatterplot of Weight vs Displacement')
plt.xlabel('Weight (wt)')
plt.ylabel('Displacement (disp)')
plt.legend(title = 'Cylinders')
plt.show()
*SOLUTION (2 E.)*¶
Based on the above scatter plot, we can see that:
- There is a clear positive correlation between the weight of a car and its displacement, which says that as the weight of the car increases, the engine displacement also increases along
2. The distribution with different cylinders says:
* Cars with 3 cylinders (blue) are rare and generally have low displacement and weight
* Cars with 4 cylinders (orange) are clustered towards the lower end of both weight and displacement, indicating they are generally lighter and have smaller engines
* Cars with 5 cylinders (green) are sparse and are too much scattered. They fall in the middle range of weight and displacement
* Cars with 6 cylinders (red) are in the mid to upper range of weight and displacement, showing moderate engine size and weight
* Cars with 8 cylinders (purple) are concentrated at the higher end of the spectrum for both weight and displacement, indicating they are heavier with larger engines. We also see outliers for these types of cars
3. The cluster distribution of weight and displacement for different numbers of cylinders in cars suggests that cylinder count plays a significant role in influencing weight and displacement
4. There are a few outliers particularly among the 8 cylinder cars. This can show that there are some cars which have a higher displacement relative to their weight. These might represent high-performance vehicles with large engines but not excessively heavy
*SOLUTION (2 F.)*¶
# Weight vs fuel efficiency, coloured by cylinder count
plt.figure(figsize = (10, 6))
sns.scatterplot(data = car, x = 'wt', y = 'mpg', hue = 'cyl', palette = 'bright', s = 100, alpha = 0.7)
plt.grid(True)
plt.title('Scatterplot of Weight vs Miles per Gallon')
plt.xlabel('Weight (wt)')
plt.ylabel('Miles Per Gallon (mpg)')
plt.legend(title = 'Cylinders')
plt.show()
*SOLUTION (2 G.)*¶
Based on the above scatter plot, we can see that:
- There is a strong negative correlation between the weight of a car and its efficiency (mpg), which says that as the weight of the car increases, its fuel efficiency decreases
2. The distribution with different cylinders says:
* Cars with 3 cylinders (blue) are very few and have relatively have high mpg with low weight
* Cars with 4 cylinders (orange) generally have the highest mpg and are lighter when compared to their higher-cylinder counterparts. They form a dense cluster in the lower weight and higher mpg range
* Cars with 5 cylinders (green) are rare and scattered, with moderate mpg and weight. They are mostly in the mid range, though
* Cars with 6 cylinders (red) have moderate to low mpg and are heavier compared to 4-cylinder cars
* Cars with 8 cylinders (purple) are the heaviest and have the lowest mpg, forming a distinct cluster at the higher weight and lower mpg range in the plot
3. The distinct clusters based on the number of cylinders are more prominent for 4, 6, and 8 cylinders, indicating that the number of cylinders is a significant factor in both weight and fuel efficiency
Overall, we see that heavier cars with more cylinders generally have lower fuel efficiency with mpg often below 20 whereas lighter cars (mostly with 4 cylinders) achieve higher fuel efficiency with mpg values mostly above 25
*SOLUTION (2 H.)*¶
To start off, the first thing we see since beginning is that all the columns in *car* dataframe are numeric, except horse power ('hp').
The column is of 'object' type which suggests that there can be some irrelevant values not matching with numeric standards. Let's check them
# Inspect raw horsepower values -- the object dtype hints at non-numeric entries.
car['hp'].unique()
array([130, 165, 150, 140, 198, 220, 215, 225, 190, 170, 160, 95, 97, 85,
88, 46, 87, 90, 113, 200, 210, 193, '?', 100, 105, 175, 153, 180,
110, 72, 86, 70, 76, 65, 69, 60, 80, 54, 208, 155, 112, 92, 145,
137, 158, 167, 94, 107, 230, 49, 75, 91, 122, 67, 83, 78, 52, 61,
93, 148, 129, 96, 71, 98, 115, 53, 81, 79, 120, 152, 102, 108, 68,
58, 149, 89, 63, 48, 66, 139, 103, 125, 133, 138, 135, 142, 77, 62,
132, 84, 64, 74, 116, 82], dtype=object)
# Rows where horsepower holds the '?' placeholder instead of a number
car[car['hp'] == '?']
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| 32 | 25.0 | 4 | 98.0 | ? | 2046 | 19.0 | 71 | 1 |
| 126 | 21.0 | 6 | 200.0 | ? | 2875 | 17.0 | 74 | 1 |
| 330 | 40.9 | 4 | 85.0 | ? | 1835 | 17.3 | 80 | 2 |
| 336 | 23.6 | 4 | 140.0 | ? | 2905 | 14.3 | 80 | 1 |
| 354 | 34.5 | 4 | 100.0 | ? | 2320 | 15.8 | 81 | 2 |
| 374 | 23.0 | 4 | 151.0 | ? | 3035 | 20.5 | 82 | 1 |
We see that there are 6 records in *car* dataframe with an unexpected value (special character - '?') in 'hp' column. Let's plan to impute it with the median of this column.
Before that, let's convert it to numeric column. This will convert the non numeric special character to 'NaN' values and we'll be able to identify null values
# Coerce 'hp' to numeric; errors='coerce' turns the '?' placeholders into NaN
# so they show up as nulls and can be imputed.
car['hp'] = pd.to_numeric(car['hp'], errors='coerce')
car.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mpg 398 non-null float64 1 cyl 398 non-null int64 2 disp 398 non-null float64 3 hp 392 non-null float64 4 wt 398 non-null int64 5 acc 398 non-null float64 6 yr 398 non-null int64 7 origin 398 non-null int64 dtypes: float64(4), int64(4) memory usage: 25.0 KB
Now, we see that after converting 'hp' column to numeric ('float64') value, there are 6 null values in the column which need to be imputed with median
Another round of check which needs to be covered is to run a check on all other features for any unexpected values (which is highly unlikely).
Let's start with the code
# Scan every feature for unexpected values: NaNs, and (for non-numeric columns)
# entries that are neither strings nor numbers.
for column in car.columns:
    unique_values = car[column].unique()
    unexpected_values = []
    for value in unique_values:
        if pd.isna(value): # Checking for NaN values
            unexpected_values.append(value)
        elif not pd.api.types.is_numeric_dtype(car[column]) and not isinstance(value, str):
            unexpected_values.append(value) # Checking for non-string non-numeric values, which is highly unlikely
    if unexpected_values:
        print(f"Column '{column}' has unexpected values: {unexpected_values}")
# Checking for unexpected values across all datapoints (rows)
unexpected_rows = car[car.isnull().any(axis = 1)]
if not unexpected_rows.empty:
    print(f"\nAnd those unexpected values across below {len(unexpected_rows)} rows:\n\n{unexpected_rows}")
    # Announce the imputation plan only when there is actually something to
    # impute (previously this line printed even for a clean frame).
    print("Imputing these with Median - ", car['hp'].median())
else:
    print("No unexpected values found across datapoints.")
Column 'hp' has unexpected values: [nan]
And those unexpected values across below 6 rows:
mpg cyl disp hp wt acc yr origin
32 25.0 4 98.0 NaN 2046 19.0 71 1
126 21.0 6 200.0 NaN 2875 17.0 74 1
330 40.9 4 85.0 NaN 1835 17.3 80 2
336 23.6 4 140.0 NaN 2905 14.3 80 1
354 34.5 4 100.0 NaN 2320 15.8 81 2
374 23.0 4 151.0 NaN 3035 20.5 82 1
Imputing these with Median - 93.5
After running for all features, we see that only 'hp' column has missing values. Let's impute these 6 with the median
# Impute the missing horsepower values with the column median.
# Direct assignment instead of chained `Series.replace(..., inplace=True)`:
# the chained-inplace form raises a FutureWarning in pandas >= 2.1 and, under
# Copy-on-Write, can silently operate on a temporary copy and do nothing.
car['hp'] = car['hp'].fillna(car['hp'].median())
# Re-checking for unexpected values across all datapoints (rows)
unexpected_rows = car[car.isnull().any(axis = 1)]
if unexpected_rows.empty:
    print("No unexpected values found across datapoints.")
else:
    print(f"\nAnd those unexpected values across below {len(unexpected_rows)} rows:\n\n{unexpected_rows}")
No unexpected values found across datapoints.
# Heatmap of pairwise Pearson correlations between all numeric features
plt.figure(figsize=(15, 10))
sns.heatmap(car.corr(), annot = True, vmin = -1, vmax = 1, fmt = ".2f", cmap = "Spectral")
plt.show()
Let's first scale the dataframe so we have an equal weight to all the features and all contribute equally to the clustering process making it more robust and meaningful
# Standardize every feature to zero mean / unit variance so no single
# attribute dominates the distance-based clustering.
standard_scaler = StandardScaler()
car_scaled = standard_scaler.fit_transform(car)
car_scaled
array([[-0.7064387 , 1.49819126, 1.0906037 , ..., -1.29549834,
-1.62742629, -0.71514478],
[-1.09075062, 1.49819126, 1.5035143 , ..., -1.47703779,
-1.62742629, -0.71514478],
[-0.7064387 , 1.49819126, 1.19623199, ..., -1.65857724,
-1.62742629, -0.71514478],
...,
[ 1.08701694, -0.85632057, -0.56103873, ..., -1.4407299 ,
1.62198339, -0.71514478],
[ 0.57460104, -0.85632057, -0.70507731, ..., 1.10082237,
1.62198339, -0.71514478],
[ 0.95891297, -0.85632057, -0.71467988, ..., 1.39128549,
1.62198339, -0.71514478]])
cluster_range = range(2, 11)  # evaluate K-Means for k = 2 through 10
wcss = []                     # within-cluster sum of squares for each k
silhouette_scores = []        # one record per k with its silhouette score
# Fit K-Means once per candidate cluster count and collect diagnostics
for k in cluster_range:
    kmeans = KMeans(n_clusters = k, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 42)
    kmeans.fit(car_scaled)
    wcss.append(kmeans.inertia_)
    labels = kmeans.labels_
    avg_score = silhouette_score(car_scaled, labels)
    silhouette_scores.append({'Number of Clusters': k, 'Silhouette Score': format(avg_score, '.5f')})
# Diagnostics below come from the last fitted model (k = 10)
print("\nWCSS Values:", wcss)
print("\nFinal Inertia:", format(kmeans.inertia_,'.4f'))
print("\nNumber of Iterations to Converge:", kmeans.n_iter_)
print("\n\nSilhouette Scores for different clusters - ")
silhouette_df = pd.DataFrame(silhouette_scores)
silhouette_df
WCSS Values: [1588.592456791863, 1190.0436525504851, 988.0688340114527, 829.7157874488786, 750.8737927045283, 681.6526858914018, 633.4035136870295, 596.7852585282914, 555.7346945917886] Final Inertia: 555.7347 Number of Iterations to Converge: 8 Silhouette Scores for different clusters -
| Number of Clusters | Silhouette Score | |
|---|---|---|
| 0 | 2 | 0.41489 |
| 1 | 3 | 0.32467 |
| 2 | 4 | 0.31197 |
| 3 | 5 | 0.33297 |
| 4 | 6 | 0.33652 |
| 5 | 7 | 0.29218 |
| 6 | 8 | 0.29054 |
| 7 | 9 | 0.26592 |
| 8 | 10 | 0.27413 |
From the silhouette scores, we see that:
- The highest silhouette score is for 2 clusters
- The score drops for 3 clusters and slightly decreases further for 4 clusters
- We see a local maximum at 5 and 6 clusters again. Though it is lower than the score for 2 clusters, it's observed to be higher than 3
However, let's check on the Elbow Method as well to see where do we see the WCSS value starting to decrease
*SOLUTION (3 B.)*¶
# Elbow method: plot WCSS (inertia) against each candidate cluster count
plt.figure(figsize = (10, 6))
plt.plot(cluster_range, wcss, marker = 'o', linestyle = '--', label = 'WCSS')
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('The Elbow Method - To determine Optimal number of Clusters')
plt.show()
The identification of the Elbow Point can be done where the Within-Cluster Sum of Squares (WCSS) starts to diminish at a slower rate. We can take that as the optimal number of clusters.
Here in this plot above, we can notice drop in WCSS twice, one at k = 3 and another at k = 4
- From k = 2 to k = 3, we see a noticeable drop in WCSS value
- Again from k = 3 to k = 4, a significant drop is observed in WCSS
Given this analysis, both k = 3 and k = 4 could be considered as potential elbow points. However, k = 3 might be more appropriate as the true elbow point and we can consider this value as the optimal choice because it marks the point where the reduction rate first starts to diminish more noticeably.
*SOLUTION (3 C.)*¶
Analysis on the possible Elbow points, we have below conclusions:
- From Elbow method, we see the primary optimal elbow point as 3 clusters as the WCSS drop significantly slows down after this point
- We calculate the silhouette scores for different numbers of clusters. Higher scores indicate better-defined clusters
- From Silhouette score, 2 clusters have the highest silhouette score, but considering the Elbow Method and the overall trend, 3 clusters is a reasonable compromise
- We also see 2 secondary points, 5 and 6 clusters, as they have relatively higher silhouette scores than 3 clusters, making them secondary considerations
We can check on the visualisation of these below:
# Recompute WCSS for k = 2..10 so this cell is self-contained
wcss = []
for i in range(2, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 42)
    kmeans.fit(car_scaled)
    wcss.append(kmeans.inertia_)
# Candidate elbow points from the earlier analysis
elbow_points = [3, 5, 6]
# Create the plot
plt.figure(figsize=(8, 3))
# The x-axis must start at 2: wcss[0] belongs to k = 2, so the previous
# range(1, len(wcss) + 1) shifted the whole curve one cluster to the left and
# put the annotated elbow lines at the wrong positions on the curve.
plt.plot(range(2, len(wcss) + 2), wcss, marker = 'o', linestyle = '-')
# Annotate the elbow points (wcss[point - 2] is the WCSS for k = point)
for point in elbow_points:
    plt.axvline(x = point, linestyle = '--', color = 'red', alpha = 0.5)
    plt.text(point, wcss[point - 2] + 20, f' {point}', color = 'red', fontsize = 12)
# Add labels and title
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method For Optimal Number of Clusters')
# Add legend
plt.legend(['WCSS', 'Elbow Points'])
# Adjusting layout and display the plot
plt.tight_layout()
plt.show()
*SOLUTION (3 D.)*¶
Keeping k = 3 as the optimal number of clusters and training the K-means clustering model again
k = 3 # optimal number of clusters from above analysis
# Final model: 15 random initialisations, fixed seed for reproducibility
kmeans_optimal = KMeans(n_clusters = k, max_iter = 300, n_init = 15, random_state = 42)
kmeans_optimal
KMeans(n_clusters=3, n_init=15, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=3, n_init=15, random_state=42)
# Fit the final 3-cluster model on the standardized feature matrix
kmeans_optimal.fit(car_scaled)
KMeans(n_clusters=3, n_init=15, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=3, n_init=15, random_state=42)
# Diagnostics for the final k = 3 model
wcss = kmeans_optimal.inertia_
labels = kmeans_optimal.labels_
# Print WCSS values
print("\nWCSS Values for k = 3:", wcss)
# Print number of iterations
print("\nNumber of Iterations to Converge for k = 3:", kmeans_optimal.n_iter_)
print("\nSilhouette Score for k = 3 - ", format(silhouette_score(car_scaled, labels), '.5f'))
# Checking on the centroids (coordinates are in standardized feature space)
cluster_centers = kmeans_optimal.cluster_centers_
cluster_centers
WCSS Values for k = 3: 1190.0436525504851 Number of Iterations to Converge for k = 3: 15 Silhouette Score for k = 3 - 0.32467
array([[-1.12726026, 1.4864187 , 1.46865697, 1.47338845, 1.36736404,
-1.03371846, -0.61198576, -0.71514478],
[-0.23301503, -0.12590632, -0.05634295, -0.24639985, 0.02416321,
0.43720956, 0.08226919, -0.60579879],
[ 0.89844152, -0.81610375, -0.86426529, -0.70547867, -0.86985567,
0.27002569, 0.31010992, 0.95968268]])
# Wrap the scaled array back into a DataFrame so centroids can be profiled per feature
car_scaled_df = pd.DataFrame(car_scaled, columns = car.columns)
car_scaled_df.head()
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| 0 | -0.706439 | 1.498191 | 1.090604 | 0.673118 | 0.630870 | -1.295498 | -1.627426 | -0.715145 |
| 1 | -1.090751 | 1.498191 | 1.503514 | 1.589958 | 0.854333 | -1.477038 | -1.627426 | -0.715145 |
| 2 | -0.706439 | 1.498191 | 1.196232 | 1.197027 | 0.550470 | -1.658577 | -1.627426 | -0.715145 |
| 3 | -0.962647 | 1.498191 | 1.061796 | 1.197027 | 0.546923 | -1.295498 | -1.627426 | -0.715145 |
| 4 | -0.834543 | 1.498191 | 1.042591 | 0.935072 | 0.565841 | -1.840117 | -1.627426 | -0.715145 |
# One row per cluster centroid, columns named after the original features
centroid_df = pd.DataFrame(cluster_centers, columns = car_scaled_df.columns)
centroid_df
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| 0 | -1.127260 | 1.486419 | 1.468657 | 1.473388 | 1.367364 | -1.033718 | -0.611986 | -0.715145 |
| 1 | -0.233015 | -0.125906 | -0.056343 | -0.246400 | 0.024163 | 0.437210 | 0.082269 | -0.605799 |
| 2 | 0.898442 | -0.816104 | -0.864265 | -0.705479 | -0.869856 | 0.270026 | 0.310110 | 0.959683 |
From above, we see below for each cluster:
- Cluster 0: Vehicles with lower mpg, higher values for cylinders, displacement, horsepower, and weight
- Cluster 1: Vehicles with average features closer to the dataset's mean
- Cluster 2: Vehicles with higher mpg, lower values for cylinders, displacement, horsepower, and weight
# Analyzing the centroids to understand feature importance
# (mean of each feature's centroid values across the 3 clusters)
centroid_means = centroid_df.mean()
print("Mean values of centroids:\n")
print(centroid_means)
Mean values of centroids: mpg -0.153945 cyl 0.181470 disp 0.182683 hp 0.173837 wt 0.173891 acc -0.108828 yr -0.073202 origin -0.120420 dtype: float64
# Bar chart of the per-feature mean centroid values
fig, ax = plt.subplots(figsize = (10, 6))
ax.bar(centroid_df.columns, centroid_means)
ax.set_xlabel('Features')
ax.set_ylabel('Mean Centroid Value')
ax.set_title('Mean Centroid Values Across Features')
plt.xticks(rotation = 45)
plt.show()
From above, we see below points:
- Cylinders (cyl), Displacement (disp), Horsepower (hp), and Weight (wt) have positive mean centroid values. This indicates that these features are above the overall mean for the dataset within the clusters
- Miles per Gallon (mpg), Acceleration (acc), Year (yr), and Origin (origin) have negative mean centroid values. This suggests that these features are below the overall mean for the dataset within the clusters
- The features cyl, disp, hp, and wt have significantly higher positive centroid values compared to other features, indicating that these features are the most distinguishing characteristics of the clusters
- On the other hand, mpg has the most negative centroid value, highlighting a strong inverse relationship with the positive features, suggesting that vehicles with higher values in cyl, disp, hp, and wt have lower fuel efficiency
- The negative centroid value for mpg and positive centroid values for hp and wt suggest a trade-off between fuel efficiency and vehicle performance. Vehicles with higher horsepower and weight tend to have lower fuel efficiency
*SOLUTION (3 E.)*¶
# Adding the cluster labels as a new feature in the original DataFrame
car['Cluster'] = labels
# Printing random samples from the dataframe (seeded for reproducibility)
car.sample(5, random_state = 42)
| mpg | cyl | disp | hp | wt | acc | yr | origin | Cluster | |
|---|---|---|---|---|---|---|---|---|---|
| 198 | 33.0 | 4 | 91.0 | 53.0 | 1795 | 17.4 | 76 | 3 | 2 |
| 396 | 28.0 | 4 | 120.0 | 79.0 | 2625 | 18.6 | 82 | 1 | 1 |
| 33 | 19.0 | 6 | 232.0 | 100.0 | 2634 | 13.0 | 71 | 1 | 1 |
| 208 | 13.0 | 8 | 318.0 | 150.0 | 3940 | 13.2 | 76 | 1 | 0 |
| 93 | 14.0 | 8 | 318.0 | 150.0 | 4237 | 14.5 | 73 | 1 | 0 |
# Number of vehicles assigned to each of the 3 clusters
car['Cluster'].value_counts()
Cluster 2 161 1 137 0 100 Name: count, dtype: int64
# Confirm the label column is integer-typed
car['Cluster'].dtype
dtype('int32')
# Identifying important features distinguishing clusters.
# Name order matches cluster_centers.flatten() (row-major): all features of
# cluster 0 first, then cluster 1, then cluster 2.
features_with_clusters = [f'{feat}_{i}' for i in range(len(cluster_centers)) for feat in car_scaled_df.columns]
centroid_values = cluster_centers.flatten()
feature_importance = pd.Series(centroid_values, index = features_with_clusters).sort_values(ascending = False)
# Build the plotting frame FROM the sorted Series' own index. The previous
# version passed the UNSORTED name list alongside the sorted Series; pandas
# took the Series' index for the frame and filled the list positionally, so
# every bar was labelled with the wrong feature. reset_index keeps each
# name paired with its own value.
feature_importance_df = feature_importance.rename_axis('Feature').reset_index(name = 'Importance')
plt.figure(figsize = (15, 8))
sns.barplot(feature_importance_df, color = 'green', x = 'Feature', y = 'Importance')
plt.title(f"Feature Importance for Car Dataframe Clustered to 3 clusters")
plt.xticks(rotation = 90)
plt.show()
We have now labeled each of our car dataframe records cluster-wise for 3 clusters - 0, 1, and 2 and named that column as 'Cluster'.
Based on the feature importance, we can see below points for different clusters:
- Cluster 0 -
- cyl_0, hp_0, disp_0, wt_0 have a positive importance whereas mpg_0, acc_0, yr_0, origin_0 have a negative one
- This suggests that Cluster 0 is characterized by powerful and heavier vehicles however, they have lower miles per gallon and acceleration and are likely to be older
- So we can say that, Cluster 0 is characterized by powerful, heavier, and older vehicles with lower fuel efficiency
- Cluster 1 -
- With acc_1, yr_1 having a positive importance we an say that Vehicles in Cluster 1 have higher acceleration and are likely to be newer
- However, vehicles in Cluster 1 have lower fuel efficiency, fewer cylinders, lower displacement, lower horsepower indicated by negative importance for mpg_1, cyl_1, disp_1, hp_1
- This says that Cluster 1 consists of newer, more agile cars with lower power and fuel efficiency
- Cluster 2 -
- There is a positive importance for mpg_2, yr_2, acc_2 in Cluster 2 suggesting these are characterized by higher fuel efficiency, newer model years, and higher acceleration
- A negative importance for cyl_2, disp_2, wt_2, hp_2 suggest that Vehicles in Cluster 2 have fewer cylinders, lower displacement, are lighter, and have lower horsepower
- Inference from this is that Cluster 2 includes fuel-efficient, newer, and lighter vehicles with higher acceleration and potentially more environmentally friendly cars
*SOLUTION (3 F.)*¶
Below is the 3-D plot for MPG, HP and WT as we only have 3 dimensions so 3 features to list in the plot
## 3D plot of the clusters in (mpg, hp, wt) feature space
fig = plt.figure(figsize = (12, 8))
ax = fig.add_subplot(111, projection = '3d')
# Scatter plot -- colour encodes the K-Means cluster label
scatter = ax.scatter(car['mpg'], car['hp'], car['wt'], c = car['Cluster'], cmap = 'viridis', s = 100)
# Adding labels
ax.set_title('3D Scatter Plot of MPG, HP, and WT Colored by Cluster')
ax.set_xlabel('MPG')
ax.set_ylabel('HP')
ax.set_zlabel('WT')
# Adding legend (one entry per cluster, derived from the scatter colours)
legend1 = ax.legend(*scatter.legend_elements(), title = "Clusters")
ax.add_artist(legend1)
plt.show()
Based on above 3-D model, we can say below points:
- Cluster 0 (Purple) contains vehicles with lower MPG values, indicating they are less fuel-efficient. These vehicles tend to have higher Horsepower (HP) and higher Weight (WT), suggesting they might be larger, more powerful vehicles
- Cluster 1 (Teal) vehicles have moderate MPG, HP, and WT values, indicating they strike a balance between power and fuel efficiency. These vehicles are likely mid-sized cars
- Cluster 2 (Yellow) is characterized by vehicles with high MPG values, indicating high fuel efficiency. The vehicles in this cluster generally have lower HP and WT values, suggesting they are smaller, lighter, and less powerful cars which are optimized for economy rather than performance
Below is the pair plot visualization of all features where clusters are distinguished by colors. Please zoom the plot to view clearly
# Pair plot of all features with points coloured by assigned cluster
sns.pairplot(car, hue = 'Cluster', palette = 'coolwarm', markers=["o", "s", "D"])
plt.suptitle("Pair Plot of All Features Colored by Cluster", y = 1.02)
plt.show()
The visual plots for datapoints distinguished by clusters with different colors can be summarized below:
- Cluster 0 (Blue) tends to have higher 'mpg' values and lower 'horsepower' and 'weight' values. This segment is likely to appeal to environmentally conscious consumers and those looking to save on fuel costs
- Cluster 2 (Red) tends to have lower 'mpg' values and higher 'horsepower' and 'weight' values. This segment of cars are more appealing to consumers looking for performance-oriented vehicles
- Cluster 1 (Grey) seems to have a more scattered and overlapping distribution with other clusters in some features. These vehicles may appeal to a broad audience looking for a balance between performance and fuel efficiency
*SOLUTION (3 G.)*¶
# Predicting which cluster a specific set of features belong to based on the datapoints
# Let's create a realistic model and take input from the user themselves
def get_user_input(prompt, data_type):
    """Prompt the user repeatedly until the entered text converts cleanly to data_type.

    Parameters
    ----------
    prompt : str
        Text shown to the user on every attempt.
    data_type : callable
        Target type/converter (e.g. int, float); a ValueError from it re-prompts.
    """
    while True:
        raw = input(prompt)
        try:
            return data_type(raw)
        except ValueError:
            print("Invalid input. Please enter a valid value of type:", data_type)
# Interactively collect one value per feature, enforcing each feature's numeric type
print("Input Values for your Car Model\n")
mpg = get_user_input(" Enter MPG value: ", float)
cyl = get_user_input(" Enter Cylinder count: ", int)
disp = get_user_input(" Enter Displacement value: ", float)
hp = get_user_input(" Enter Horsepower value: ", float)
wt = get_user_input(" Enter Weight value: ", float)
acc = get_user_input(" Enter Acceleration value: ", float)
yr = get_user_input(" Enter Year: ", int)
origin = get_user_input(" Enter Origin: ", int)
# creating the new datapoint as an array; car.columns[:-1] drops the trailing
# 'Cluster' column so the new frame matches the feature columns only
new_car_data = pd.DataFrame(np.array([[mpg, cyl, disp, hp, wt, acc, yr, origin]]), columns = car.columns[:-1])
# Since the original data for car dataframe was scaled, let's scale this new one also
# NOTE(review): assumes standard_scaler was fitted earlier on the same 8 feature
# columns in this order — verify against the fitting cell
new_car_data_scaled = standard_scaler.transform(new_car_data)
print("\n\nEntered Datapoint - ")
new_car_data
# sample data from original dataframe to confirm validation
# 13.0 8 318.0 150.0 3940 13.2 76 1 0
Input Values for your Car Model
Enter MPG value: 13
Enter Cylinder count: 8.4
Invalid input. Please enter a valid value of type: <class 'int'>
Enter Cylinder count: 8
Enter Displacement value: 318
Enter Horsepower value: 150
Enter Weight value: 3940
Enter Acceleration value: 13.2r
Invalid input. Please enter a valid value of type: <class 'float'>
Enter Acceleration value: 13.2
Enter Year: 76
Enter Origin: 1
Entered Datapoint -
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| 0 | 13.0 | 8.0 | 318.0 | 150.0 | 3940.0 | 13.2 | 76.0 | 1.0 |
Here we have entered the same values as one of the actual records and the predicted value for this is Cluster 0. Let's see if we get the same result from our prediction also
# Predicting which cluster the new data point belongs to
predicted_cluster = kmeans_optimal.predict(new_car_data_scaled)
# One interpretation message per cluster label, looked up instead of an if/elif chain
cluster_messages = {
    0: "The new data point is predicted to belong to Cluster 0. \nThis type of vehicle is less fuel-efficient but might appeal to consumers who prioritize power, capacity, and features over fuel economy",
    1: "The new data point is predicted to belong to Cluster 1. \nThis type of vehicle represents mid-range vehicles, possibly compact cars, or mid-sized sedans and appeals to a broad market seeking a balance between performance and fuel efficiency.",
    2: "The new data point is predicted to belong to Cluster 2. \nThis type of vehicle likely represents more fuel-efficient vehicles, such as compact cars, hybrids, or electric vehicles and appeal to environmentally conscious consumers and those looking to save on fuel costs",
}
label = predicted_cluster[0]
if label in cluster_messages:
    print(cluster_messages[label])
The new data point is predicted to belong to Cluster 0. This type of vehicle is less fuel-efficient but might appeal to consumers who prioritize power, capacity, and features over fuel economy
*We got the same result as Cluster 0 and hence validated our model clustering as correct one*
# Load the vehicle silhouette dataset for the classification part of the project
vehicle = pd.read_csv("C:/Users/pri96/OneDrive/Documents/AI and ML PGP/Module 5 - Unsupervised Learning (Week 17 to Week 19)/Project/vehicle.csv")
vehicle.head()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 95 | 48.0 | 83.0 | 178.0 | 72.0 | 10 | 162.0 | 42.0 | 20.0 | 159 | 176.0 | 379.0 | 184.0 | 70.0 | 6.0 | 16.0 | 187.0 | 197 | van |
| 1 | 91 | 41.0 | 84.0 | 141.0 | 57.0 | 9 | 149.0 | 45.0 | 19.0 | 143 | 170.0 | 330.0 | 158.0 | 72.0 | 9.0 | 14.0 | 189.0 | 199 | van |
| 2 | 104 | 50.0 | 106.0 | 209.0 | 66.0 | 10 | 207.0 | 32.0 | 23.0 | 158 | 223.0 | 635.0 | 220.0 | 73.0 | 14.0 | 9.0 | 188.0 | 196 | car |
| 3 | 93 | 41.0 | 82.0 | 159.0 | 63.0 | 9 | 144.0 | 46.0 | 19.0 | 143 | 160.0 | 309.0 | 127.0 | 63.0 | 6.0 | 10.0 | 199.0 | 207 | van |
| 4 | 85 | 44.0 | 70.0 | 205.0 | 103.0 | 52 | 149.0 | 45.0 | 19.0 | 144 | 241.0 | 325.0 | 188.0 | 127.0 | 9.0 | 11.0 | 180.0 | 183 | bus |
print("There are", vehicle.shape[0], "rows and", vehicle.shape[1], "columns in the dataframe")
There are 846 rows and 19 columns in the dataframe
*SOLUTION (1 B.)*¶
vehicle.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 846 entries, 0 to 845 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 compactness 846 non-null int64 1 circularity 841 non-null float64 2 distance_circularity 842 non-null float64 3 radius_ratio 840 non-null float64 4 pr.axis_aspect_ratio 844 non-null float64 5 max.length_aspect_ratio 846 non-null int64 6 scatter_ratio 845 non-null float64 7 elongatedness 845 non-null float64 8 pr.axis_rectangularity 843 non-null float64 9 max.length_rectangularity 846 non-null int64 10 scaled_variance 843 non-null float64 11 scaled_variance.1 844 non-null float64 12 scaled_radius_of_gyration 844 non-null float64 13 scaled_radius_of_gyration.1 842 non-null float64 14 skewness_about 840 non-null float64 15 skewness_about.1 845 non-null float64 16 skewness_about.2 845 non-null float64 17 hollows_ratio 846 non-null int64 18 class 846 non-null object dtypes: float64(14), int64(4), object(1) memory usage: 125.7+ KB
Based on the above information, we can infer below:
- All values are of numerical type, except class. We can use label encoding for this to convert it to numerical type feature however we won't be doing that right now and would see if it's required in future
- There are no null values for 5 features (class, hollows_ratio, max.length_rectangularity, max.length_aspect_ratio, compactness). All remaining features have null values and require imputation. We'll impute them with their respective median values
# Share of missing entries per column, expressed as a percentage
null_mask = vehicle.isnull()
missing_percentages = 100 * null_mask.mean()
print("Percentage of missing values in each column:")
print(missing_percentages)
# Absolute missing-value count per column (displayed as the cell's last expression)
null_mask.sum()
Percentage of missing values in each column: compactness 0.000000 circularity 0.591017 distance_circularity 0.472813 radius_ratio 0.709220 pr.axis_aspect_ratio 0.236407 max.length_aspect_ratio 0.000000 scatter_ratio 0.118203 elongatedness 0.118203 pr.axis_rectangularity 0.354610 max.length_rectangularity 0.000000 scaled_variance 0.354610 scaled_variance.1 0.236407 scaled_radius_of_gyration 0.236407 scaled_radius_of_gyration.1 0.472813 skewness_about 0.709220 skewness_about.1 0.118203 skewness_about.2 0.118203 hollows_ratio 0.000000 class 0.000000 dtype: float64
compactness 0 circularity 5 distance_circularity 4 radius_ratio 6 pr.axis_aspect_ratio 2 max.length_aspect_ratio 0 scatter_ratio 1 elongatedness 1 pr.axis_rectangularity 3 max.length_rectangularity 0 scaled_variance 3 scaled_variance.1 2 scaled_radius_of_gyration 2 scaled_radius_of_gyration.1 4 skewness_about 6 skewness_about.1 1 skewness_about.2 1 hollows_ratio 0 class 0 dtype: int64
# Scan every column for unexpected entries: NaNs anywhere, plus (in non-numeric
# columns) values that are neither strings nor numbers.
columns_with_unexpected_values = []
for column in vehicle.columns:
    bad_values = []
    for value in vehicle[column].unique():
        if pd.isna(value):
            # missing entry
            bad_values.append(value)
        elif not pd.api.types.is_numeric_dtype(vehicle[column]) and not isinstance(value, str):
            # non-string, non-numeric value in an object column (highly unlikely)
            bad_values.append(value)
    if bad_values:
        print(f"Column '{column}' has unexpected values: {bad_values}")
        columns_with_unexpected_values.append(column)
# Rows that contain at least one of those unexpected (null) values
unexpected_rows = pd.DataFrame(vehicle[vehicle.isnull().any(axis = 1)])
if unexpected_rows.empty:
    print("No unexpected values found across datapoints.")
else:
    print(f"\nAnd some of those unexpected values across below {len(unexpected_rows)} rows:\n\n")
unexpected_rows.head()
Column 'circularity' has unexpected values: [nan] Column 'distance_circularity' has unexpected values: [nan] Column 'radius_ratio' has unexpected values: [nan] Column 'pr.axis_aspect_ratio' has unexpected values: [nan] Column 'scatter_ratio' has unexpected values: [nan] Column 'elongatedness' has unexpected values: [nan] Column 'pr.axis_rectangularity' has unexpected values: [nan] Column 'scaled_variance' has unexpected values: [nan] Column 'scaled_variance.1' has unexpected values: [nan] Column 'scaled_radius_of_gyration' has unexpected values: [nan] Column 'scaled_radius_of_gyration.1' has unexpected values: [nan] Column 'skewness_about' has unexpected values: [nan] Column 'skewness_about.1' has unexpected values: [nan] Column 'skewness_about.2' has unexpected values: [nan] And some of those unexpected values across below 33 rows:
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 107 | NaN | 106.0 | 172.0 | 50.0 | 6 | 255.0 | 26.0 | 28.0 | 169 | 280.0 | 957.0 | 264.0 | 85.0 | 5.0 | 9.0 | 181.0 | 183 | bus |
| 9 | 93 | 44.0 | 98.0 | NaN | 62.0 | 11 | 183.0 | 36.0 | 22.0 | 146 | 202.0 | 505.0 | 152.0 | 64.0 | 4.0 | 14.0 | 195.0 | 204 | car |
| 19 | 101 | 56.0 | 100.0 | 215.0 | NaN | 10 | 208.0 | 32.0 | 24.0 | 169 | 227.0 | 651.0 | 223.0 | 74.0 | 6.0 | 5.0 | 186.0 | 193 | car |
| 35 | 100 | 46.0 | NaN | 172.0 | 67.0 | 9 | 157.0 | 43.0 | 20.0 | 150 | 170.0 | 363.0 | 184.0 | 67.0 | 17.0 | 7.0 | 192.0 | 200 | van |
| 66 | 81 | 43.0 | 68.0 | 125.0 | 57.0 | 8 | 149.0 | 46.0 | 19.0 | 146 | 169.0 | 323.0 | 172.0 | NaN | NaN | 18.0 | 179.0 | 184 | bus |
# Impute the missing values detected above with the per-class median of each column.
# Fixed: the original used vehicle[column].fillna(..., inplace=True), which calls
# fillna on a column selection — a chained-assignment pattern that is deprecated in
# modern pandas (FutureWarning, and a no-op under Copy-on-Write). Assigning the
# result back to the column is the supported, equivalent form.
for column in columns_with_unexpected_values:
    # median of this column within each 'class' group, broadcast back to row shape
    group_medians = vehicle.groupby('class')[column].transform('median')
    vehicle[column] = vehicle[column].fillna(group_medians)
vehicle.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 846 entries, 0 to 845 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 compactness 846 non-null int64 1 circularity 846 non-null float64 2 distance_circularity 846 non-null float64 3 radius_ratio 846 non-null float64 4 pr.axis_aspect_ratio 846 non-null float64 5 max.length_aspect_ratio 846 non-null int64 6 scatter_ratio 846 non-null float64 7 elongatedness 846 non-null float64 8 pr.axis_rectangularity 846 non-null float64 9 max.length_rectangularity 846 non-null int64 10 scaled_variance 846 non-null float64 11 scaled_variance.1 846 non-null float64 12 scaled_radius_of_gyration 846 non-null float64 13 scaled_radius_of_gyration.1 846 non-null float64 14 skewness_about 846 non-null float64 15 skewness_about.1 846 non-null float64 16 skewness_about.2 846 non-null float64 17 hollows_ratio 846 non-null int64 18 class 846 non-null object dtypes: float64(14), int64(4), object(1) memory usage: 125.7+ KB
Now all columns have non null values. Let's check the 5 Point summary for the dataframe
vehicle.describe(include = 'all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| compactness | 846.0 | NaN | NaN | NaN | 93.678487 | 8.234474 | 73.0 | 87.0 | 93.0 | 100.0 | 119.0 |
| circularity | 846.0 | NaN | NaN | NaN | 44.826241 | 6.13434 | 33.0 | 40.0 | 44.0 | 49.0 | 59.0 |
| distance_circularity | 846.0 | NaN | NaN | NaN | 82.066194 | 15.754263 | 40.0 | 70.0 | 80.0 | 98.0 | 112.0 |
| radius_ratio | 846.0 | NaN | NaN | NaN | 168.916076 | 33.427561 | 104.0 | 141.0 | 167.25 | 195.0 | 333.0 |
| pr.axis_aspect_ratio | 846.0 | NaN | NaN | NaN | 61.680851 | 7.882557 | 47.0 | 57.0 | 61.0 | 65.0 | 138.0 |
| max.length_aspect_ratio | 846.0 | NaN | NaN | NaN | 8.567376 | 4.601217 | 2.0 | 7.0 | 8.0 | 10.0 | 55.0 |
| scatter_ratio | 846.0 | NaN | NaN | NaN | 168.920804 | 33.199802 | 112.0 | 147.0 | 157.0 | 198.0 | 265.0 |
| elongatedness | 846.0 | NaN | NaN | NaN | 40.927896 | 7.813401 | 26.0 | 33.0 | 43.0 | 46.0 | 61.0 |
| pr.axis_rectangularity | 846.0 | NaN | NaN | NaN | 20.579196 | 2.590879 | 17.0 | 19.0 | 20.0 | 23.0 | 29.0 |
| max.length_rectangularity | 846.0 | NaN | NaN | NaN | 147.998818 | 14.515652 | 118.0 | 137.0 | 146.0 | 159.0 | 188.0 |
| scaled_variance | 846.0 | NaN | NaN | NaN | 188.643026 | 31.37802 | 130.0 | 167.0 | 179.0 | 217.0 | 320.0 |
| scaled_variance.1 | 846.0 | NaN | NaN | NaN | 439.665485 | 176.492876 | 184.0 | 318.25 | 364.0 | 586.75 | 1018.0 |
| scaled_radius_of_gyration | 846.0 | NaN | NaN | NaN | 174.712766 | 32.546284 | 109.0 | 149.0 | 174.0 | 198.0 | 268.0 |
| scaled_radius_of_gyration.1 | 846.0 | NaN | NaN | NaN | 72.443262 | 7.470873 | 59.0 | 67.0 | 71.0 | 75.0 | 135.0 |
| skewness_about | 846.0 | NaN | NaN | NaN | 6.356974 | 4.904073 | 0.0 | 2.0 | 6.0 | 9.0 | 22.0 |
| skewness_about.1 | 846.0 | NaN | NaN | NaN | 12.604019 | 8.930921 | 0.0 | 5.0 | 11.0 | 19.0 | 41.0 |
| skewness_about.2 | 846.0 | NaN | NaN | NaN | 188.919622 | 6.152167 | 176.0 | 184.0 | 188.0 | 193.0 | 206.0 |
| hollows_ratio | 846.0 | NaN | NaN | NaN | 195.632388 | 7.438797 | 181.0 | 190.25 | 197.0 | 201.0 | 211.0 |
| class | 846 | 3 | car | 429 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
- There are 846 instances with 19 attributes (columns) including both numerical (18) and categorical (1) features
- Some attributes like radius_ratio, pr.axis_aspect_ratio, max.length_aspect_ratio, scaled_variance.1, scaled_radius_of_gyration, and skewness_about.1 have relatively high standard deviations compared to their means, suggesting potential outliers or significant variability in the data
- The class attribute has 3 unique classes (car, van, bus) with car being the most frequent (429 instances). This suggests an imbalance where one class (car) dominates
- Compactness and circularity have mean and median values almost similar, which signifies that they both are normally distributed and have no skewness/outlier
- We can have further insights with various EDA
*SOLUTION (1 C.)*¶
# Frequency of each vehicle class (car / bus / van)
class_counts = vehicle['class'].value_counts()
# Visualize the class balance as a pie chart
plt.figure(figsize = (8, 6))
plt.pie(class_counts, startangle = 140, autopct = '%1.1f%%', labels = class_counts.index)
plt.title('Distribution of Classes')
plt.show()
# Relative frequency (percent) of each class
print("Percentage of values for variable 'class':")
print(class_counts / len(vehicle) * 100)
Percentage of values for variable 'class': class car 50.709220 bus 25.768322 van 23.522459 Name: count, dtype: float64
Based on above pie-chart, we see that:
- Approximately 50.7% of the vehicles in the dataset are classified as cars. The percentage split of buses and vans is ~25.8% and ~23.5% respectively
- The dataset is slightly imbalanced towards cars, which constitute more than half of the vehicles. Buses and vans make up the remaining portion, with buses being slightly more frequent than vans
We can also say that the models trained on this dataframe may be biased towards predicting 'car' instances more accurately due to their higher representation in the dataset
*SOLUTION (1 D.)*¶
# Look for fully duplicated records in the vehicle dataframe
duplicate_rows = vehicle[vehicle.duplicated()]
if duplicate_rows.empty:
    print("No duplicate rows found.")
else:
    print(f"Number of duplicate rows: {len(duplicate_rows)}")
    print("Duplicate rows:")
    print(duplicate_rows)
No duplicate rows found.
There are no duplicate rows, so no further steps are required for imputation/correction
Before proceeding to next parts, let's have some analysis on the given dataset
PAIR PLOT¶
sns.pairplot(vehicle, diag_kind = 'kde', hue = 'class')
<seaborn.axisgrid.PairGrid at 0x191fe819810>